-
Notifications
You must be signed in to change notification settings - Fork 15.3k
AMDGPU: Handle s_add_u32 in eliminateFrameIndex #129628
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: We can fold frame indexes directly into existing immediate operands, just like is already done for s_add_i32. This avoids, but does not address, a failure exposed after a316539 where two literal operands end up in the final instruction. Patch is 30.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129628.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7d6990c097774..128cd8244a477 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2713,7 +2713,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
return true;
}
- case AMDGPU::S_ADD_I32: {
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_U32: {
// TODO: Handle s_or_b32, s_and_b32.
unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
@@ -2773,7 +2774,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
DstReg = TmpReg;
}
- auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+ auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
.addDef(DstReg, RegState::Renamable)
.addReg(MaterializedReg, RegState::Kill)
.add(OtherOp);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index c3b48b5d2ddff..378c6312c52be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -142,13 +142,12 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: v_mov_b32_e32 v0, s48
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240
; GCN-NEXT: v_mov_b32_e32 v0, s49
-; GCN-NEXT: s_and_b32 s4, s25, 63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244
; GCN-NEXT: v_mov_b32_e32 v0, s50
-; GCN-NEXT: s_lshl_b32 s4, s4, 2
+; GCN-NEXT: s_and_b32 s4, s25, 63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248
; GCN-NEXT: v_mov_b32_e32 v0, s51
-; GCN-NEXT: s_add_u32 s4, 0, s4
+; GCN-NEXT: s_lshl_b32 s4, s4, 2
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252
; GCN-NEXT: v_mov_b32_e32 v0, s24
; GCN-NEXT: v_mov_b32_e32 v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir
new file mode 100644
index 0000000000000..af61bd70f16b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir
@@ -0,0 +1,123 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW32 %s
+
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s
+
+---
+name: s_add_u32__inline_imm__fi_offset0
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 32, alignment: 16 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; MUBUFW64-LABEL: name: s_add_u32__inline_imm__fi_offset0
+ ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+ ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_U32 12, $sgpr4, implicit-def dead $scc
+ ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7
+ ;
+ ; MUBUFW32-LABEL: name: s_add_u32__inline_imm__fi_offset0
+ ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc
+ ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_U32 12, $sgpr4, implicit-def dead $scc
+ ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7
+ ;
+ ; FLATSCRW64-LABEL: name: s_add_u32__inline_imm__fi_offset0
+ ; FLATSCRW64: renamable $sgpr7 = S_ADD_U32 12, $sgpr32, implicit-def dead $scc
+ ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7
+ ;
+ ; FLATSCRW32-LABEL: name: s_add_u32__inline_imm__fi_offset0
+ ; FLATSCRW32: renamable $sgpr7 = S_ADD_U32 12, $sgpr32, implicit-def dead $scc
+ ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7
+ renamable $sgpr7 = S_ADD_U32 12, %stack.0, implicit-def dead $scc
+ SI_RETURN implicit $sgpr7
+
+...
+
+---
+name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 96, alignment: 16 }
+ - { id: 1, size: 128, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; MUBUFW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+ ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW64-NEXT: {{ $}}
+ ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW64-NEXT: renamable $sgpr7 = S_MOV_B32 164
+ ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7
+ ;
+ ; MUBUFW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+ ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW32-NEXT: {{ $}}
+ ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW32-NEXT: renamable $sgpr7 = S_MOV_B32 164
+ ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7
+ ;
+ ; FLATSCRW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+ ; FLATSCRW64: renamable $sgpr7 = S_MOV_B32 164
+ ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7
+ ;
+ ; FLATSCRW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+ ; FLATSCRW32: renamable $sgpr7 = S_MOV_B32 164
+ ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7
+ renamable $sgpr7 = S_ADD_U32 68, %stack.1, implicit-def dead $scc
+ SI_RETURN implicit $sgpr7
+...
+
+---
+name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 96, alignment: 16 }
+ - { id: 1, size: 128, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; MUBUFW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+ ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW64-NEXT: {{ $}}
+ ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+ ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+ ;
+ ; MUBUFW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+ ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW32-NEXT: {{ $}}
+ ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+ ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+ ;
+ ; FLATSCRW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+ ; FLATSCRW64: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+ ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+ ;
+ ; FLATSCRW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+ ; FLATSCRW32: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+ ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+ renamable $sgpr7 = S_ADD_U32 68, %stack.1, implicit-def $scc
+ SI_RETURN implicit $sgpr7, implicit $scc
+...
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 346b69c362c04..96d0e383761d1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -38,7 +38,6 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 2, v0
@@ -76,12 +75,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -113,8 +109,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
@@ -168,7 +163,6 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -207,11 +201,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -246,11 +238,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
@@ -304,7 +294,6 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -343,11 +332,9 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -382,11 +369,9 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
@@ -440,7 +425,6 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 2, v0
@@ -483,8 +467,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -520,8 +503,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
@@ -576,7 +558,6 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -616,11 +597,10 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -657,11 +637,10 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
@@ -717,7 +696,6 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -757,11 +735,10 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -798,11 +775,10 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT:...
[truncated]
|
Merge activity
|
d70a17f to
7ef23b8
Compare
We can fold frame indexes directly into existing immediate operands, just like is already done for s_add_i32. We happen to use s_add_i32 in the 32-bit add case, but s_add_u32 appears in the 64-bit add sequence of a flat pointer if an addrspacecast source is a frame index. This avoids, but does not address, a failure exposed after a316539 where two literal operands end up in the final instruction. The underlying issue still exists for some instructions without special handling in eliminateFrameIndex.
7ef23b8 to
62bfd5c
Compare
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/77/builds/9661 Here is the relevant piece of the build log for reference. |

We can fold frame indexes directly into existing immediate operands,
just like is already done for s_add_i32. We happen to use s_add_i32 in
the 32-bit add case, but s_add_u32 appears in the 64-bit add sequence
of a flat pointer if an addrspacecast source is a frame index.
This avoids, but does not address a failure exposed after
a316539 where two literal operands
end up in the final instruction. The underlying issue still exists for
some instructions without special handling in eliminateFrameIndex.